from sklearn.feature_extraction.text import CountVectorizer

# Demo: bag-of-words feature extraction with CountVectorizer.
# Two short documents are turned into a document-term count matrix.
data = [
    "life is short, i like python",
    "life is too long, i dislike python",
]

# Create a vectorizer instance with default settings.
cv = CountVectorizer()

# fit() learns the vocabulary and transform() converts the documents into a
# term-count matrix; fit_transform() does both in one call. The two-step
# equivalent would be:
#     cv.fit(data)
#     result = cv.transform(data)
result = cv.fit_transform(data)

# Feature names: the distinct words found across all documents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned ndarray into plain Python strings so the
# printed output keeps the familiar list form.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)
# ['dislike', 'is', 'life', 'like', 'long', 'python', 'short', 'too']

# Feature values: dense view of the counts, one row per document and one
# column per feature name above.
print(result.toarray())
# [[0 1 1 1 0 1 1 0]
#  [1 1 1 0 1 1 0 1]]

# The sparse matrix as returned by fit_transform().
print(result)




